library(Seurat)
## Loading required package: ggplot2
## Loading required package: cowplot
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
## 
##     ggsave
## Loading required package: Matrix
library(scmap)
## Creating a generic function for 'toJSON' from package 'jsonlite' in package 'googleVis'
library(SingleCellExperiment)
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:Matrix':
## 
##     colMeans, colSums, rowMeans, rowSums, which
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, cbind, colMeans,
##     colnames, colSums, do.call, duplicated, eval, evalq, Filter,
##     Find, get, grep, grepl, intersect, is.unsorted, lapply,
##     lengths, Map, mapply, match, mget, order, paste, pmax,
##     pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce,
##     rowMeans, rownames, rowSums, sapply, setdiff, sort, table,
##     tapply, union, unique, unsplit, which, which.max, which.min
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:Matrix':
## 
##     expand
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:base':
## 
##     apply
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:matrixStats':
## 
##     count
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following objects are masked from 'package:GenomicRanges':
## 
##     intersect, setdiff, union
## The following object is masked from 'package:GenomeInfoDb':
## 
##     intersect
## The following objects are masked from 'package:IRanges':
## 
##     collapse, desc, intersect, setdiff, slice, union
## The following objects are masked from 'package:S4Vectors':
## 
##     first, intersect, rename, setdiff, setequal, union
## The following objects are masked from 'package:BiocGenerics':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Data loading and inspection of the metadata.

load('/data/pub-others/tabula_muris/figshare/180126-facs/maca.seurat_obj.facs.figshare_180126.RData')
head(seurat_obj@meta.data)
sce_maca <- as.SingleCellExperiment(seurat_obj)
all10x <- readRDS('/projects/pytrik/sc_adipose/analyze_10x_fluidigm/data/10x-180504')
sce_10x <- as.SingleCellExperiment(all10x)

#convert maca gene names to uppercase to match 10x gene names
rowData(sce_maca)['feature_symbol'] <- unlist(lapply(rowData(sce_maca)$gene, function(x){return(toupper(x))}))
rowData(sce_10x)['feature_symbol'] <- rowData(sce_10x)$gene

counts(sce_10x) <- as.matrix(counts(sce_10x))
logcounts(sce_10x) <- as.matrix(logcounts(sce_10x))

counts(sce_maca) <- as.matrix(counts(sce_maca))
logcounts(sce_maca) <- as.matrix(logcounts(sce_maca))
sce_maca <- selectFeatures(sce_maca, suppress_plot = FALSE)

Running scmap on the whole MACA dataset

All the different cell types.

as.data.frame(unique(seurat_obj@meta.data$tissue_cell_type))

Setting the right column for clustering.

sce_maca <- indexCluster(sce_maca, cluster_col = 'tissue_cell_type')

Predicting cell types in our dataset.

scmapCluster_results <- scmapCluster(
  projection = sce_10x, 
  index_list = list(
    sce_maca = metadata(sce_maca)$scmap_cluster_index
  ),
  threshold=0.5  #default=0.7 
)
## Warning in setFeatures(projection, rownames(index)): Features
## 1190002H23RIK, 2210415F13RIK, 4632428N05RIK, 8430408G22RIK, ADH1, C1QA,
## C1QB, C1QC, C1RA, C330006A16RIK, CAR2, CCL6, CD24A, CD53, CKMT1, CXCR7,
## CYP4B1, ERCC-00009, ERCC-00042, ERCC-00092, ERCC-00108, ERCC-00111,
## ERCC-00116, FCGR2B, FCGR3, FCRLS, FTL1, GIMAP6, GPI1, GPIHBP1, GPR34, H2-
## AA, H2-AB1, H2-D1, H2-DMA, H2-DMB2, H2-EB1, H2-K1, HMGCS2, IL11RA1, LASS2,
## LGALS7, LRRC33, LY6A, LY6C1, LY86, LYZ2, MS4A1, MT1, MT2, P2RY13, PGLYRP1,
## PKM2, SERPINB1A, SFPI1, SIGLECH, TPRGL, TRF, TUBB5 are not present in the
## 'SCESet' object and therefore were not set.
predictions_all <- as.data.frame(table(scmapCluster_results$scmap_cluster_labs)) 
predictions_all[order(-predictions_all$Freq),]

A lot of the cells got labeled unassigned (22,101 cells from a total of 56,371). This is much less when only using the old MACA fat dataset as reference (then, only 8,110 cells got labeled unassigned). This could be because there are multiple celltypes in the dataset that are likely the same (for example, adipose mesenchymal stem cells, bladder mesenchymal stem cells and thymus mesenchymal stem cells are all mesenchymal stem cells). Scmap requires that for every cell in our dataset to be assigned a certain celltype, the 3 nearest neighbors from the reference dataset should have that celltype. So say we have some mesenchymal cells in our data, then a lot of our cells will be unassigned because the nearest neighbors will be both bladder mesenchymal, thymus mesenchymal and adipose mesenchymal.

A lot of annotations for mammary stromal cell. Mesenchymal cells are multipotent stromal cells: https://en.wikipedia.org/wiki/Mesenchymal_stem_cell. Stromal cells are connective tissue cells of any organ.

Sankey diagram of how the annotations map to the clusters in our data (cluster 12 = mixture cluster):

plot(
  getSankey(
    colData(sce_10x)$res.0.5, 
    scmapCluster_results$scmap_cluster_labs[,"sce_maca"],
    plot_height = 400
  )
)
## starting httpd help server ... done
#Link: http://yggdrasil:7000/custom/googleVis/SankeyID4d53a14fdfb.html

The annotations for the mixture cluster (total of 1,139 cells, of which 376 are unassigned). Most cell types are only assigned for a few cells, but two celltypes have a higher number of assignments:
Mamary basal cell (333) Marrow hematopoietic stem cell (208)

pred_mixt <- as.data.frame(table(scmapCluster_results$scmap_cluster_labs[which(colData(sce_10x)$res.0.5 %in% 12),'sce_maca']))
pred_mixt[order(-pred_mixt$Freq),]

Labeling in the tSNE.

#To prevent the plot being not visible because of too many labels.

predicted_labels_all <- as.data.frame(
    row.names=rownames(sce_10x@colData), 
    x=as.vector(scmapCluster_results$scmap_cluster_labs))
names(predicted_labels_all) <- 'predicted_labels'
all10x <- AddMetaData(all10x, metadata=predicted_labels_all, col.name='predicted_labels')
TSNEPlot(all10x, group.by='predicted_labels', pt.size=1, do.label=T, label.size=6)

Labeling in tSNE with low frequency annotations removed (less than 300).

celltypes_unassign <- predictions_all[predictions_all$Freq < 300, 'Var1']
all10x@meta.data['predictions_all_clean'] <- all10x@meta.data$predicted_labels
all10x@meta.data[which(all10x@meta.data$predicted_labels %in% celltypes_unassign), 'predictions_all_clean'] = 'unassigned'

TSNEPlot(all10x, group.by='predictions_all_clean', pt.size=0.5)

TSNEPlot(all10x, group.by='sample_name', pt.size=0.1, do.label=T)

Running scmap on the MACA Fat subset

Cell types in the fat dataset.

seurat_obj@meta.data %>% filter(tissue=="Fat") %>% distinct(tissue_cell_type)

Subsetting and preparing the data.

maca_fat <- SubsetData(SetAllIdent(seurat_obj, id='tissue'), ident.use="Fat")
sce_maca_fat <- as.SingleCellExperiment(maca_fat)
rowData(sce_maca_fat)['feature_symbol'] <- unlist(lapply(rowData(sce_maca_fat)$gene, function(x){return(toupper(x))}))
counts(sce_maca_fat) <- as.matrix(counts(sce_maca_fat))
logcounts(sce_maca_fat) <- as.matrix(logcounts(sce_maca_fat))
sce_maca_fat <- selectFeatures(sce_maca_fat, suppress_plot = FALSE)

Setting the right column for clustering.

sce_maca_fat <- indexCluster(sce_maca_fat, cluster_col = 'cell_ontology_class')

Predicting cell types in our dataset.

scmapCluster_results_fat <- scmapCluster(
  projection = sce_10x, 
  index_list = list(
    sce_maca_fat = metadata(sce_maca_fat)$scmap_cluster_index
  ),
  threshold=0.5  #default=0.7 
)
## Warning in setFeatures(projection, rownames(index)): Features
## 1190002H23RIK, 8430408G22RIK, ADH1, AW112010, C1RA, C4B, CAR4, CCL6,
## CCL9, CCR2, CD2, CD24A, CD48, CD53, CXCR7, CYB5, CYBB, CYP4B1, CYP4F18,
## ERCC-00009, ERCC-00108, F13A1, FCGR2B, FCGR3, GIMAP3, GIMAP6, GM11428,
## GPIHBP1, H2-AA, H2-AB1, H2-D1, H2-DMA, H2-DMB1, H2-DMB2, H2-EB1, H2-K1, H2-
## OB, H2-Q6, HMGCS2, IFI205, IFI27L2A, IL11RA1, LILRB4, LRRC33, LY6A, LY6C1,
## LY86, LYZ2, MGL2, MMP23, MRC1, MS4A1, MS4A4B, MS4A4C, MS4A4D, MS4A6B,
## MS4A6C, MT1, NEURL3, PECAM1, PGCP, RETNLA, SERPINB6A, SFPI1, SLFN2, SPNB2,
## TPRGL, TRF are not present in the 'SCESet' object and therefore were not
## set.
pred_fat <- as.data.frame(table(scmapCluster_results_fat$scmap_cluster_labs))
pred_fat <- pred_fat[order(-pred_fat$Freq),]
pred_fat

Now less cells are unassigned. The “mesenchymal stem cell of adipose” are likely the “multipotent progenitors” from the old MACA dataset (similar nr of annotations and similar colouring in tSNE (see below)).

Predictions in cluster 12:

pred_mixt_fat <- as.data.frame(table(scmapCluster_results_fat$scmap_cluster_labs[which(colData(sce_10x)$res.0.5 %in% 12), 'sce_maca_fat']))
pred_mixt_fat[order(-pred_mixt_fat$Freq),]

Interestingly, a lot of epithelial cell predictions and not that much mesenchymal stem cell predictions.

Sankey diagram:

plot(
  getSankey(
    colData(sce_10x)$res.0.5, 
    scmapCluster_results_fat$scmap_cluster_labs[,"sce_maca_fat"],
    plot_height = 400
  )
)
#Link: http://yggdrasil:7000/custom/googleVis/SankeyID4d539a2079.html
predicted_labels_fat <- as.data.frame(
    row.names=rownames(sce_10x@colData), 
    x=as.vector(scmapCluster_results_fat$scmap_cluster_labs))
names(predicted_labels_fat) <- 'predicted_labels_fat'
all10x <- AddMetaData(all10x, metadata=predicted_labels_fat, col.name='predicted_labels_fat')
TSNEPlot(all10x, group.by='predicted_labels_fat', pt.size=0.1, do.label=T)

TSNEPlot(all10x, group.by='sample_name', pt.size=0.1, do.label=T)

Inspection of the MACA data

I performed PCA and tSNE (20 pc’s) on the MACA data.

maca <- readRDS('/projects/pytrik/sc_adipose/analyze_10x_fluidigm/data/downloads/MACA_data_PCA_tSNE')

Coloured by tissue.

TSNEPlot(maca, group.by='tissue', label.size=6, do.label=T)
## Warning: Removed 1 rows containing missing values (geom_text).

All mesenchymal cells

DimPlot(maca, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=maca@cell.names[grep('mesenchymal', maca@meta.data$cell_ontology_class)], cols.highlight='blue', cols.use='gray')

Only the mesenchymal stem cells from adipose

DimPlot(maca, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=maca@cell.names[maca@meta.data$cell_ontology_class == 'mesenchymal stem cell of adipose'], cols.highlight='blue', cols.use='gray')

Let’s see where some of the most frequent predicted cell types are in the MACA data.

The predictions for our data when using all MACA data:

predictions_all <- predictions_all[order(-predictions_all$Freq),]
predictions_all

Mammary stromal cells (11453 annotations). Wikipedia says mesenchymal cells are multipotent stromal cells: https://en.wikipedia.org/wiki/Mesenchymal_stem_cell. Stromal cells are connective tissue cells of any organ: https://en.wikipedia.org/wiki/Stromal_cell. They are in the same cluster as the mesenchymal cells (see DimPlot above).

DimPlot(maca, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=maca@cell.names[maca@meta.data$tissue_cell_type == 'Mammary_stromal cell'], cols.highlight='blue', cols.use='gray')

Mammary basal cells (6960 annotations). I can not find a clear description of what mammary basal cells are. From http://www.zen-bio.com/products/cells/human-mammary-cells.php: “Basal myoepithelial cell surround the luminal cells and have both contractile muscle and epithelial properties. Basal myoepithelial cells tend to dominate normal primary cultures. Additionally, basal-type breast cancers are typically estrogen and progesterone receptor negative.” Interestingly, they are not in the mesenchymal cluster.

DimPlot(maca, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=maca@cell.names[maca@meta.data$tissue_cell_type == 'Mammary_basal cell'], cols.highlight='blue', cols.use='gray')

Which cells in our data were labeled as mammary basal cells?

DimPlot(all10x, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=all10x@cell.names[all10x@meta.data$predicted_labels == 'Mammary_basal cell'], cols.highlight='blue', cols.use='gray')

And which were labeled as mammary stromal cells?

DimPlot(all10x, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=all10x@cell.names[all10x@meta.data$predicted_labels == 'Mammary_stromal cell'], cols.highlight='blue', cols.use='grey')

Interesting to see. We can see the ‘signal’ from mammary stromal cells to basal cells going from the top left to the bottom right in the tSNE plot.

Continuing with seeing where the most frequent predicted celltypes are. Trachea_epithelial cell (6924). They are both in the mesenchymal cluster.

DimPlot(maca, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=maca@cell.names[maca@meta.data$tissue_cell_type == 'Trachea_epithelial cell'], cols.highlight='blue', cols.use='gray')

DimPlot(maca, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=maca@cell.names[maca@meta.data$tissue_cell_type == 'Aorta_fibroblast'], cols.highlight='blue', cols.use='gray')

DimPlot(maca, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=maca@cell.names[maca@meta.data$tissue_cell_type == 'Aorta_fibroblast'], cols.highlight='blue', cols.use='gray')

Where are the Trachea_epithelial cell and Aorta_fibroblast in our data?

DimPlot(all10x, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=all10x@cell.names[all10x@meta.data$predicted_labels == 'Trachea_epithelial cell'], cols.highlight='blue', cols.use='gray')

DimPlot(all10x, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=all10x@cell.names[all10x@meta.data$predicted_labels == 'Aorta_fibroblast'], cols.highlight='blue', cols.use='gray')

Inspection of the MACA Fat data

The predictions for the our whole dataset when only using the Fat subset:

pred_fat

Where are the mesenchymal cells, epithelial cells and smooth muscle cells in the MACA data?

DimPlot(maca, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=maca@cell.names[maca@meta.data$tissue_cell_type == 'Fat_mesenchymal stem cell of adipose'], cols.highlight='blue', cols.use='gray')

DimPlot(maca, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=maca@cell.names[maca@meta.data$tissue_cell_type == 'Fat_epithelial cell'], cols.highlight='blue', cols.use='gray')

DimPlot(maca, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=maca@cell.names[maca@meta.data$tissue_cell_type == 'Fat_smooth muscle cell'], cols.highlight='blue', cols.use='gray')

And in our data?

DimPlot(all10x, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=all10x@cell.names[all10x@meta.data$predicted_labels_fat == 'mesenchymal stem cell of adipose'], cols.highlight='blue', cols.use='gray')

DimPlot(all10x, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=all10x@cell.names[all10x@meta.data$predicted_labels_fat == 'epithelial cell'], cols.highlight='blue', cols.use='gray')

DimPlot(all10x, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=all10x@cell.names[all10x@meta.data$predicted_labels_fat == 'smooth muscle cell'], cols.highlight='blue', cols.use='gray')

Closer inspection MACA data for mixture cluster

The predictions when using all MACA data:

pred_mixt_ordered <- pred_mixt[order(-pred_mixt$Freq),]
pred_mixt_ordered

The predictions when only using the Fat data:

pred_mixt_fat[order(-pred_mixt_fat$Freq),]

When using all MACA data, most cells in the mixture cluster got assigned as mammary basal cells or hematopoietic stem cells. When only using the Fat data, most cells got assigned as epithelial cell.

Are the mammary basal cells and hematopoetic stem cells close to the Fat epithelial cells in the MACA data?

maca@meta.data['annotations_mixture'] <- maca@meta.data$tissue_cell_type
maca@meta.data[which(!(maca@meta.data$annotations_mixture %in% pred_mixt_ordered$Var1[1:5])), 'annotations_mixture'] = 'rest' 
maca@meta.data[maca@meta.data$tissue_cell_type == 'Fat_epithelial cell', 'annotations_mixture'] = 'Fat_epithelial cell'

TSNEPlot(maca, group.by='annotations_mixture')

TSNEPlot(maca, group.by='annotations_mixture', cells.use=maca@cell.names[maca@meta.data$annotations_mixture != 'rest'])

Summary

When using the Fat subset as a reference, most cells got assigned as mesenchymal stem cell. The mixture cluster got assigned as epithelial cells and cells close to the mixture cluster got assigned as smooth muscle cells.

When using the whole MACA dataset as a reference, most cells in our dataset got assigned mammary stromal cell and mammary basal cell. Mesenchymal cells are also called multipotent stromal cells, and the mammary stromal cells are in the same cluster as the mesenchymal cells in the MACA data. The mammary basal cells don’t group in the mesenchymal cluster in the MACA data. The pattern of assignments in our data is interesting to see (see plots below). It does not seem to be related to the subtissue. Could the pattern represent the development of our preadipocytes? Cells getting the assignment basal cell could be further developed than cells getting the assignment stromal cell.

DimPlot(all10x, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=all10x@cell.names[all10x@meta.data$predicted_labels == 'Mammary_stromal cell'], cols.highlight='blue', cols.use='gray')

DimPlot(all10x, dim.1=1, dim.2=2, reduction.use='tsne', cells.highlight=all10x@cell.names[all10x@meta.data$predicted_labels == 'Mammary_basal cell'], cols.highlight='blue', cols.use='gray')

TSNEPlot(all10x, group.by='sample_name', pt.size=0.1, do.label=T)